############### ###############
## 03 -  DM cleaning
## Project: CBO
## Author: Kamil Kouhen
## Purpose: Cleaning and management of DM data
## Date of creation: 09/03/2022
############### ###############

library(here)
#Running master file and ad-hoc function rcodes
#source(here("Code", "Rcode", "Master.R"), echo = T) #Master (contains necessary packages)

### Missing characters as NA ###
# Starting point of the cleaned dataset: every character column of DM_raw has
# its empty strings ("") recoded as NA; all other columns are left untouched.
DM_intermediate <- mutate(
  DM_raw,
  across(
    all_of(colnames(select_if(DM_raw, is.character))),
    function(x) ifelse(x == "", NA, as.character(x))
  )
)

### Checking survey completion ###
  
  #Note# In DM data, data provider used multiple categories for survey_found variable

#Only keeping respondents who expressed consent (those who expressed consent should all be marked as found)
# A missing survey_found is treated as "not found": filter() drops rows whose
# condition evaluates to NA, so a bare `!= "found"` comparison would let
# consenting respondents with an NA survey_found slip through unnoticed.
if (nrow(DM_intermediate %>% filter(consent == "yes", is.na(survey_found) | survey_found != "found")) > 0) stop("There are obs who expressed consent but who are not marked as found, please check and correct if necessary")
DM_intermediate %<>%
  filter(consent == "yes")

# Tabulation (NA included) for visual inspection, then the same NA-aware check
# on the consenting sample.
table(DM_intermediate$survey_found, useNA = "always")
if (nrow(DM_intermediate %>% filter(is.na(survey_found) | survey_found != "found")) > 0) stop("There are still survey_found values other than 'found', please check why.")

table(DM_intermediate$consent, DM_intermediate$survey_found)

### Only keeping variables that could be useful for CBO analysis ###
# Three successive drops:
#   1. every "survey" variable except those containing "survey_found";
#   2. "begin" / "end_" variables and the `question` column;
#   3. variable families collected for other studies.
# contains() given a character vector selects the union of the matches.
DM_intermediate %<>%
  dplyr::select(-(contains("survey") & !contains("survey_found"))) %>%
  dplyr::select(-contains(c("begin", "end_")), -question) %>%
  dplyr::select(-contains(c(
    "legit",
    "muniperf",
    "image_",
    "traits",
    "coin",
    "experiment",
    "transfer_",
    "corr_",
    "threat",
    "__0",
    "dup",
    "screenshot"
  ))) #For other studies
 
# Snapshot of the dataset at this stage; later integrity checks compare
# DM_intermediate against it.
DM_forchecks <- DM_intermediate

### Changing order of variables for easier use ###
# Identifier and location columns are moved to the front, the rest keep their order.
DM_intermediate %<>%
  dplyr::relocate(id_DM, region, commune)

### Making sure id variables are stored as character vars ###
# `tocharacter` is kept in the environment: it is reused further down to
# protect these columns from the character-to-numeric conversion.
tocharacter <- names(
  dplyr::select(DM_intermediate, id_DM, contains("link"), appcode_treat, appcode_ctrl)
)
DM_intermediate %<>%
  mutate(across(all_of(tocharacter), as.character))

#Identifying all-missing variables
# The predicate is applied to every column (not only numeric ones). The names
# are computed once and printed explicitly: a bare expression inside `else {}`
# is not displayed when the script is run through a plain source().
all_missing_vars <- DM_intermediate %>%
  dplyr::select_if(function(x) all(is.na(x))) %>%
  colnames()

if (length(all_missing_vars) == 0) {
  print("None of the numeric variables is all missing")
} else {
  print("The following variables are all missing, please take care of them:")
  print(all_missing_vars)
} #Not doing anything to them for now
rm(all_missing_vars)

### Random check of integrity of dataset ###
# Compared with the DM_forchecks snapshot: the number of rows must be unchanged,
# no column may have been lost, and each commune must hold the same number of
# observations.
if (nrow(DM_forchecks) != nrow(DM_intermediate)) stop("Something went wrong: some observations were dropped since the first creation of DM_intermediate")
if (ncol(DM_forchecks) > ncol(DM_intermediate)) stop("Something went wrong: some variables were dropped since the first creation of DM_intermediate")
# Tabulating by commune compares the counts commune by commune. A setdiff() on
# the bare count vectors only compares the *sets* of count values, so two
# communes swapping their counts would go undetected.
if (!identical(
  table(as.character(DM_forchecks$commune), useNA = "ifany"),
  table(as.character(DM_intermediate$commune), useNA = "ifany")
)) stop("Something went wrong: Inconsistent number of obs per commune.")
###                                      ####

### Looking for non standard missing values (e.g. negative such as -999 or -888 or -97) ###
# Candidate columns: columns that are (or can be read as) numeric, excluding
# baseline / SUPERMUN / monitoring variables and GPS records, and that contain
# at least one negative value.
testNA <- DM_intermediate %>%
  mutate_all(function(x) Hmisc::all.is.numeric(x, what = "vector", extras = NA)) %>%
  select_if(is.numeric) %>%
  # `-` (set difference) is required here: select(!a, !b, !c) takes the UNION
  # of the three complements and therefore keeps every column.
  select(-ends_with("_BL"), -ends_with("_S"), -ends_with("_MON")) %>% #Not looking at baseline, monitoring and SUPERMUN variables
  select(-contains("recordgps")) %>%
  keep(~any(as.numeric(.x) < 0 & !is.na(.x)))

if (length(names(testNA)) > 0) {
  print("Some numeric variables contain values that seem to be non-standard missing (e.g. -999, or -97)")

  #Per the codebook, many variables used -999, -888 & -97 to record missing values
  # Explicit print() calls: bare expressions inside a braced block are not displayed.
  print(names(testNA)) #Displaying name of columns in this case
  print(unique(unlist(
    lapply(testNA, function(x) x[!is.na(x) & x < 0]),
    use.names = FALSE
  ))) #Displaying values in this case
  rm(testNA)

  #Note# -1 and -2 values are legitimate, they are part of multi-choice questions

  #Replacing all -999, -888 & -97 values by NA for all variables
  # (baseline, SUPERMUN and monitoring variables are left untouched).
  # The three comparisons are combined with `|`; the test stays NA where the
  # value is NA, so the result equals three successive ifelse() passes.
  sentinel_chr_cols <- DM_intermediate %>%
    select_if(is.character) %>%
    select(-ends_with("_BL"), -ends_with("_S"), -ends_with("_MON")) %>%
    colnames()
  sentinel_num_cols <- DM_intermediate %>%
    select_if(is.numeric) %>%
    select(-ends_with("_BL"), -ends_with("_S"), -ends_with("_MON")) %>%
    colnames()

  DM_intermediate %<>%
    mutate(across(all_of(sentinel_chr_cols),
                  ~ifelse(. == "-999" | . == "-888" | . == "-97", NA, .))) %>%
    mutate(across(all_of(sentinel_num_cols),
                  ~ifelse(. == -999 | . == -888 | . == -97, NA, .)))
  rm(sentinel_chr_cols, sentinel_num_cols)

  #Checking if there are other types of non-standard missing values
  other_nonstandard <- DM_intermediate %>%
    select_if(is.numeric) %>%
    select(-ends_with("_BL"), -ends_with("_S"), -ends_with("_MON")) %>%
    select(-contains("recordgps")) %>%
    keep(~any((.x < 0 & .x != -1 & .x != -2) & !is.na(.x))) %>%
    names()

  if (length(other_nonstandard) != 0) {
    print(other_nonstandard) #Displaying them
    stop("There still seem to be non-standard missing values, please check before continuing.")
  } else {
    print("All non-standard missing values have been identified.")
  }
  rm(other_nonstandard)
} else {
  rm(testNA)
}

#I have also found "." values, changing them to NA
# Done outside the branch above: this recode does not depend on whether
# negative sentinel values were detected.
DM_intermediate %<>%
  mutate(across(all_of(DM_intermediate %>% select_if(is.character) %>% colnames), ~ifelse(.==".", NA, .)))

### Identifying chr variables that should be numeric (all values are numeric or "NA") and putting them as numeric ###
# Candidate columns: every character column except the id / link columns listed
# in `tocharacter`.
# any_of() rather than all_of(): an id column that is no longer character at
# this point (e.g. an all-NA column turned logical by an ifelse()-based recode)
# has already been dropped by select_if(), and all_of() would then abort with a
# "column doesn't exist" error.
charactervars <- DM_intermediate %>%
  dplyr::select_if(is.character) %>%
  dplyr::select(-any_of(tocharacter)) %>% #Not converting id and time vars
  colnames()

# Hmisc::all.is.numeric(what = "vector") returns the column converted to numeric
# when all its non-missing values are numeric, and the column unchanged otherwise.
DM_intermediate %<>% #converting if all is numeric
  dplyr::mutate(across(all_of(charactervars), function(x) Hmisc::all.is.numeric(x, what = "vector", extras = NA)))

(check <- subset(charactervars, charactervars %in% (DM_intermediate %>% select_if(is.numeric) %>% colnames))) #Checking which ones were converted
length(check) #Number of converted variables (54 when this script was written)

rm(tocharacter, charactervars, check)

### Identifying variables to be recoded as factor (e.g. if it contains a Yes/No pattern) ###
#Yes/no types
# grepl() is called on the data frame itself, so the first select_if() receives
# a precomputed logical vector instead of a predicate -- presumably one element
# per column, TRUE when the column's text matches the pattern (TODO confirm).
# The pattern is unanchored: "no" / "non" also match inside longer strings.
prefactor <- DM_intermediate %>% 
  dplyr::select_if(grepl("yes|no|Yes|No|YES|NO|oui|non|Oui|Non|OUI|NON", DM_intermediate)) %>% 
  # NOTE(review): max(nchar(x)) is taken without na.rm -- confirm how columns
  # that contain missing values are treated by this predicate.
  dplyr::select_if(function(x) all(max(nchar(x)) < 8)) %>% 
  colnames() ## Variables likely to be convertible to factor (short maximum string length)
#map(prefactor, unique) #The only non "yes" or "no" string is "dk" for "don't know". I leave them for now, before getting Malte's view on how to deal with these missing values
# Every selected column is converted to a factor in place.
DM_intermediate[prefactor] <- lapply(DM_intermediate[prefactor], factor)  ## as.factor() could also be used
rm(prefactor)

#Binary numeric vars (0/1 or 1/2 types)
# A column qualifies when it is numeric, has exactly two distinct non-missing
# values, and its range fits the coding: max == 1 with min >= 0 for the 0/1
# type, max == 2 with min >= 1 for the 1/2 type. Both lists are built before
# any conversion, and printed (outer parentheses).
(binary01_should_be_factor <- names(dplyr::select_if(DM_intermediate, function(v) {
  is.numeric(v) &&
    length(unique(na.omit(v))) == 2 &&
    max(v, na.rm = TRUE) == 1 &&
    min(v, na.rm = TRUE) >= 0
})))

(binary12_should_be_factor <- names(dplyr::select_if(DM_intermediate, function(v) {
  is.numeric(v) &&
    length(unique(na.omit(v))) == 2 &&
    max(v, na.rm = TRUE) == 2 &&
    min(v, na.rm = TRUE) >= 1
})))

# The two lists cannot overlap (different maxima), so one pass converts both.
DM_intermediate %<>%
  mutate(across(
    all_of(c(binary01_should_be_factor, binary12_should_be_factor)),
    factor
  ))

rm(binary01_should_be_factor, binary12_should_be_factor) #Cleaning the environment

#Categ variables in the -2 to 2 scale or any different scale that seems categorical (max below or equal to 2)
# Selected: numeric columns with more than three distinct non-missing values,
# all of them between -2 and 2. The list is printed (outer parentheses).
# NOTE(review): columns with exactly three distinct values match neither this
# rule nor the binary rules -- confirm this is intended.
(categnum_should_be_factor <- names(dplyr::select_if(DM_intermediate, function(v) {
  is.numeric(v) &&
    length(unique(na.omit(v))) > 3 &&
    max(v, na.rm = TRUE) <= 2 &&
    min(v, na.rm = TRUE) >= -2
})))

DM_intermediate %<>%
  mutate(across(all_of(categnum_should_be_factor), factor))
rm(categnum_should_be_factor)
  #Note# Many categorical variables were recorded in a way that makes them indistinguishable from numeric variables
       #I am using the questionnaire to spot the remaining categorical variables

#Identifying numeric variables that should be categorical (factor) using the questionnaire
# The questionnaire workbook is read from its default sheet; its `type` and
# `name` columns drive the selection. DM_questionnaire stays in the environment.
DM_questionnaire <- as_tibble(readxl::read_excel(here("Supporting Documents", "IPA deliverables", "3_Questionnaires finaux OCB & DECIDEURS", "Decision_maker_survey_endline_test19.xlsx"))) #Importing questionnaire

# Questions whose type mentions "select" but not "phone".
selectvars <- DM_questionnaire %>%
  dplyr::filter(grepl('select', type), !grepl('phone', type)) %>%
  dplyr::select(name)

# Numeric variables that are identified as "select" questions in the IPA questionnaire
shouldbefactor <- colnames(
  dplyr::select(dplyr::select_if(DM_intermediate, is.numeric), any_of(selectvars$name))
)
DM_intermediate %<>%
  mutate(across(all_of(shouldbefactor), factor))

#Same with character variables (those select_one type vars)
selectvars <- DM_questionnaire %>%
  dplyr::filter(grepl('select_one', type)) %>%
  dplyr::select(name)

# Character variables that are identified as "select_one" questions in the IPA questionnaire
shouldbefactor <- colnames(
  dplyr::select(dplyr::select_if(DM_intermediate, is.character), any_of(selectvars$name))
)
DM_intermediate %<>%
  mutate(across(all_of(shouldbefactor), factor))

rm(shouldbefactor, selectvars)

#I am also changing the "rfs" (refuse to answer) and "dnk" (don't know) codes into NAs
# Done column by column on character and factor variables: calling na_if() on a
# whole data frame is off-label usage and errors from dplyr 1.1.0 onwards.
#   - factors: the matching levels are set to NA, which turns the corresponding
#     values into NA and removes the level;
#   - characters: matching values are set to NA.
# droplevels then removes any remaining unused factor level.
DM_intermediate %<>%
  mutate(across(
    where(~ is.character(.) || is.factor(.)),
    function(col) {
      if (is.factor(col)) {
        levels(col)[levels(col) %in% c("rfs", "dnk")] <- NA
      } else {
        col[col %in% c("rfs", "dnk")] <- NA
      }
      col
    }
  )) %>%
  droplevels

#Writing yes as "1" and no as "0"
# Two separate passes, each restricted to the factor columns that actually hold
# the level being recoded (so fct_recode is never asked for an absent level).
tochange <- names(dplyr::select_if(
  DM_intermediate,
  function(v) is.factor(v) && "yes" %in% levels(v)
))
DM_intermediate %<>%
  mutate(across(all_of(tochange), function(f) forcats::fct_recode(f, "1" = "yes")))

tochange <- names(dplyr::select_if(
  DM_intermediate,
  function(v) is.factor(v) && "no" %in% levels(v)
))
DM_intermediate %<>%
  mutate(across(all_of(tochange), function(f) forcats::fct_recode(f, "0" = "no")))

rm(tochange)

### Random check of integrity of dataset ###
# Compared with the DM_forchecks snapshot: the number of rows must be unchanged,
# no column may have been lost, and each commune must hold the same number of
# observations.
if (nrow(DM_forchecks) != nrow(DM_intermediate)) stop("Something went wrong: some observations were dropped since the first creation of DM_intermediate")
if (ncol(DM_forchecks) > ncol(DM_intermediate)) stop("Something went wrong: some variables were dropped since the first creation of DM_intermediate")
# Tabulating by commune compares the counts commune by commune. A setdiff() on
# the bare count vectors only compares the *sets* of count values, so two
# communes swapping their counts would go undetected.
if (!identical(
  table(as.character(DM_forchecks$commune), useNA = "ifany"),
  table(as.character(DM_intermediate$commune), useNA = "ifany")
)) stop("Something went wrong: Inconsistent number of obs per commune.")
###                                      ####

### Using questionnaire to label variables ###
# The "survey" sheet of the questionnaire provides, for each question `name`,
# its English label; only names that are columns of DM_intermediate are kept.
varlabels <- as_tibble(readxl::read_excel(here("Supporting Documents", "IPA deliverables","3_Questionnaires finaux OCB & DECIDEURS", "Decision_maker_survey_endline_test19.xlsx"), sheet = "survey")) %>% #Importing questionnaire
  dplyr::select(name, "label::English") %>%
  rename(label = "label::English") %>%
  dplyr::filter(name %in% colnames(DM_intermediate))

# upData() takes its labels as a named list (variable name -> label). Building
# that list directly avoids transposing through an unnamed matrix, which
# triggered the tibble "must have unique column names" deprecation warning.
DM_intermediate <- Hmisc::upData(
  DM_intermediate,
  labels = stats::setNames(as.list(as.character(varlabels$label)), varlabels$name)
)
rm(varlabels)

### Ad-hoc function to create report with share of NAs for each variable in dataframe ###
share_NAs(DM_intermediate) #File exported in here("Output", "For Cleaning")

### Random check of integrity of dataset ###
# Compared with the DM_forchecks snapshot: the number of rows must be unchanged,
# no column may have been lost, and each commune must hold the same number of
# observations.
if (nrow(DM_forchecks) != nrow(DM_intermediate)) stop("Something went wrong: some observations were dropped since the first creation of DM_intermediate")
if (ncol(DM_forchecks) > ncol(DM_intermediate)) stop("Something went wrong: some variables were dropped since the first creation of DM_intermediate")
# Tabulating by commune compares the counts commune by commune. A setdiff() on
# the bare count vectors only compares the *sets* of count values, so two
# communes swapping their counts would go undetected.
if (!identical(
  table(as.character(DM_forchecks$commune), useNA = "ifany"),
  table(as.character(DM_intermediate$commune), useNA = "ifany")
)) stop("Something went wrong: Inconsistent number of obs per commune.")
###                                      ####

  #Note# Because of time pressures, I am not checking the distribution of the variables here
       # I will only check the distribution of the variables that are used in analysis

## Checking if some observations have too many missing values ##
# Working table: id_DM plus the block of variables from `language` to
# `abandonned16`. For each row, the number of NAs across these columns and that
# number divided by the size of the language:abandonned16 block are added.
todrop_test <- select(DM_intermediate, id_DM, language:abandonned16)
todrop_test$todrop_nb.NA <- rowSums(is.na(todrop_test))
todrop_test$todrop_share.NA.analysis <-
  todrop_test$todrop_nb.NA / ncol(select(DM_intermediate, language:abandonned16))

### User-written function for simple summary stats
sumstats(todrop_test$todrop_nb.NA)
sumstats(todrop_test$todrop_share.NA.analysis)

# Observations whose share of missing values exceeds 90%
lotsofNA <- filter(todrop_test, todrop_share.NA.analysis > 0.9)

if (nrow(lotsofNA) > 0) {
  message("There are observations with more than 90% of missing values, please check (they are flagged)")
}

#Ad-hoc function to report flag variable in external excel file
# One flagging() call per flagged id (plain for loop: per the original author,
# map() only kept the last iteration).
for (i in unique(lotsofNA$id_DM)) {
  idss <- paste0(i)
  flagging(
    df = DM_intermediate,
    selected.id = idss,
    as.NA = FALSE, #Not turning into NA for now
    remarks = "High number of missing values (> 90% of analytical variables)."
  )
  rm(idss)
}
rm(lotsofNA, todrop_test)

# Removing any helper column whose name contains "todrop_" from the main dataset
DM_intermediate %<>%
  select(-contains("todrop_"))

### Correcting issue with variable name inconsistencies ###
# New name = old name: the three know*_3 variables are renamed know*_2.
DM_intermediate %<>%
  rename(all_of(c(
    know8_2 = "know8_3",
    know9_2 = "know9_3",
    know10_2 = "know10_3"
  )))

### Improving format respondent_type for figures ###
# The four known codes get a display label. Any other value is kept as it is
# (as text) instead of being silently turned into NA, which is what case_when()
# does when no condition matches and no fallback is given. Missing values stay NA.
DM_intermediate %<>%
  mutate(respondent_type = case_when(
    respondent_type == "deputymayor" ~ "Deputy Mayor",
    respondent_type == "mayor" ~ "Mayor",
    respondent_type == "opposition" ~ "Opposition Member",
    respondent_type == "sg" ~ "Secretary General",
    TRUE ~ as.character(respondent_type)))

### General check of integrity of data set (before last meeting with IPA on 25/03)
# Same ad-hoc NA-share report as earlier, run on the final state of the data.
share_NAs(DM_intermediate)

### Saving intermediate (pre-preparation for analysis) cleaned blinded DM dataset ###
# Written as DM_intermediate.RDS in the "Intermediate" folder under `datatype`.
DM_intermediate %>%
  saveRDS(file = here(datatype, "Intermediate", "DM_intermediate.RDS"))

message("**03 completed")


